
*********************** LOAD DATA FOR ANALYSIS ***********************************

* Empty the data in memory, then read the delimited manuscript file.
* Variable names are taken from row 1 and their original letter case is kept.
clear
import delimited using "LifeScience_manuscript", varnames(1) case(preserve)

**********************************************************************************

*Create authorship variables
* Every row counts as one authorship.
gen a_ship = 1

* 0/1 flags by sex code (sex == 1 feeds the female_* variables, sex == 0 the male_* ones).
* A logical expression evaluates to 0 or 1, so rows with any other sex value get 0 in both.
gen female_aship = (sex == 1)
gen male_aship = (sex == 0)

* Flag exactly one row per pmid (the lowest author_order) so that summing nvals counts pmids.
bysort pmid (author_order): gen nvals = (_n == 1)
tabulate nvals

* Position-by-sex flags: first = author_order 1, last = author_order equal to n_authors.
gen first_female = (author_order == 1) & (sex == 1)
gen last_female = (author_order == n_authors) & (sex == 1)
gen first_male = (author_order == 1) & (sex == 0)
gen last_male = (author_order == n_authors) & (sex == 0)

* Intermediate = strictly between the first and the last position.
gen inter_female = (author_order > 1) & (author_order < n_authors) & (sex == 1)
gen inter_male = (author_order > 1) & (author_order < n_authors) & (sex == 0)

rename link_year j_date_year

***		CREATE RR STATISTICS BY YEAR

* Reduce the data to one row per j_date_year holding column sums:
* pmids = sum of nvals, a_ships = sum of a_ship, and the sum of each sex/position flag.
collapse (sum) ///
    pmids = nvals ///
    a_ships = a_ship ///
    first_female last_female female_aship ///
    first_male last_male male_aship ///
    inter_female inter_male, ///
    by(j_date_year)

*Rates of prestigious authorship: probability of first (last) authorship conditional on the authorship being female (male)
* Last-author rate: last-position count divided by all authorships of that sex.
gen cprob_lastm = last_male / male_aship
gen cprob_lastf = last_female / female_aship
* First-author rate: first-position count divided by first plus intermediate authorships of that sex.
* NOTE(review): this denominator leaves out last-position authorships while the last-author rate
* uses all authorships; presumably intentional, confirm against the manuscript's methods.
gen cprob_firstf = first_female / (first_female + inter_female)
gen cprob_firstm = first_male / (first_male + inter_male)

*Relative risk of prestigious authorships and confidence intervals
* Last authorship, women vs. men: ratio of the two conditional rates.
gen ratio_fm_last = cprob_lastf / cprob_lastm
* Adjusted proportions used for the interval: 0.5 added to the count and 1 to the denominator.
gen p1 = (last_female + 0.5) / (female_aship + 1)
gen p2 = (last_male + 0.5) / (male_aship + 1)
* Square-root term shared by both limits, held in double precision so reusing it
* gives the same numbers as evaluating the full expression inline.
gen double se_last = (((1-p1)/(female_aship*p1))+((1-p2)/(male_aship*p2)))^0.5
* Limits are p1/p2 multiplied by exp(+/- 1.96 * se_last).
gen CI_U_last = p1/p2*exp(1.96*se_last)
gen CI_L_last = p1/p2*exp(-1.96*se_last)
drop p1 p2 se_last

* Relative risk of first authorship, women vs. men, with 95% confidence limits.
gen ratio_fm_first = cprob_firstf/cprob_firstm
* Adjusted proportions used for the interval: 0.5 added to the count and 1 to the denominator.
* The denominator is first plus intermediate authorships, as in cprob_firstf / cprob_firstm.
gen p1 = (first_female + 0.5) / (first_female + inter_female + 1)
gen p2 = (first_male + 0.5) / (first_male + inter_male + 1)
* Standard error of log(p1/p2): sqrt( (1-p1)/(n1*p1) + (1-p2)/(n2*p2) ).
* The male term previously divided by n2 only (the *p2 factor was missing), which made
* the interval too narrow; it now has the same form as the female term and as the
* last-authorship interval. Computed once so the upper and lower limits cannot diverge.
gen double se_first = (((1-p1)/((first_female + inter_female)*p1))+((1-p2)/((first_male + inter_male)*p2)))^0.5
gen CI_U_first = p1/p2*exp(1.96*se_first)
gen CI_L_first = p1/p2*exp(-1.96*se_first)
drop p1 p2 se_first

*Create Supplemental Figure 1
* Declare the yearly data as a time series; tsline and tin() both require this.
tsset j_date_year, yearly

* Plot both relative-risk series with their upper and lower confidence limits.
* All six series use the same 1985-2015 window; previously only the two ratio lines
* were restricted, so limits for years outside the window would have been drawn
* without a matching point estimate.
* The green dashed reference line at 1 marks equal rates for women and men.
* NOTE(review): the article/authorship counts in the subtitle are hard-coded text;
* confirm they still match the data whenever the input file changes.
twoway ///
    (tsline ratio_fm_first if tin(1985,2015), lcolor(cranberry)) ///
    (tsline ratio_fm_last if tin(1985,2015), lcolor(blue)) ///
    (tsline CI_U_last if tin(1985,2015), lcolor(gray) lpattern(dot)) ///
    (tsline CI_L_last if tin(1985,2015), lcolor(gray) lpattern(dot)) ///
    (tsline CI_U_first if tin(1985,2015), lcolor(gray) lpattern(vshortdash)) ///
    (tsline CI_L_first if tin(1985,2015), lcolor(gray) lpattern(vshortdash)), ///
    xtitle(Year of publication) ///
    ytitle(Relative risk of prestigious authorships) ytitle(, size(small)) ///
    yscale(range(0 1.5)) xlabel(1985 (5) 2015) ///
    yline(1, lwidth(medium) lpattern(longdash) lcolor(green)) ///
    title("Relative risk of prestigious authorship women vs. men", size(medium)) ///
    subtitle("600,747 articles with 2,699,061 authorships (1985-2015)", size(small)) ///
    legend(order(1 "Relative risk of first authorship women versus men" ///
                 2 "Relative risk of last authorship women versus men") rows(2) size(vsmall))
